import matplotlib.pyplot as plt
def open_close_plot():
    """Display the current matplotlib figure, then close it.

    Needed in environments (e.g. RStudio) where plots are not shown/flushed
    automatically; in Jupyter or Spyder the call is harmless but unnecessary.
    """
    # The original body was not indented, which raises IndentationError;
    # the fix is to indent both statements into the function body.
    plt.show()
    plt.close()
** Ejemplo: (Código C)**
int resultado = 0;
int i;
for(i=0; i<100; i++){
resultado += i;
}
printf("%i",resultado);
** Código Python**
resultado = 0
for i in range(100):
resultado += i
print(resultado)
4950
**Nota:** En C, `resultado` es un entero; en Python es un objeto de la clase entero.
** Otro ejemplo (Código Python)**
x = 4
x = "cuatro"
** Código C**
int x = 4;
x = "cuatro"; // FALLA
Otro ejemplo son las listas de Python:
L3 = [True, "2", 3.0, 4]
print([type(i) for i in L3])
[<class 'bool'>, <class 'str'>, <class 'float'>, <class 'int'>]
Importante: A veces por razones de eficiencia conviene dar tipo a los datos
import numpy as np
** Arreglo de tipo entero por defecto**
print(np.array([1, 4, 2, 5, 3]))
[1 4 2 5 3]
** Arreglo de tipo flotante**
print(np.array([1, 2, 3, 4], dtype='float32'))
[1. 2. 3. 4.]
** Otros ejemplos de arreglos en numpy**
print(np.array([range(i, i + 3) for i in [2, 4, 6]]))
[[2 3 4]
[4 5 6]
[6 7 8]]
print(np.zeros(10, dtype=int))
[0 0 0 0 0 0 0 0 0 0]
print(np.ones((3, 5), dtype=float))
[[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]]
print(np.full((3, 5), 3.14))
[[3.14 3.14 3.14 3.14 3.14]
[3.14 3.14 3.14 3.14 3.14]
[3.14 3.14 3.14 3.14 3.14]]
print(np.arange(0, 20, 2))
[ 0 2 4 6 8 10 12 14 16 18]
print(np.random.random((3, 3)))
[[0.87490222 0.70684601 0.74370168]
[0.03223021 0.53697138 0.5483331 ]
[0.45390402 0.04325718 0.85697743]]
print(np.random.normal(0, 1, (3, 3)))
[[ 0.4275335 0.63773313 -1.05314398]
[ 1.85175662 0.44403766 1.06917008]
[ 0.22120495 0.34473847 0.7358469 ]]
** Operaciones en un Arreglo**
x = np.arange(4)
print("x =", x)
x = [0 1 2 3]
print("x + 5 =", x + 5)
x + 5 = [5 6 7 8]
print("x - 5 =", x - 5)
x - 5 = [-5 -4 -3 -2]
print("x * 2 =", x * 2)
x * 2 = [0 2 4 6]
print("x / 2 =", x / 2)
x / 2 = [0. 0.5 1. 1.5]
print("x // 2 =", x // 2) # división entera
x // 2 = [0 0 1 1]
x = np.array([-2, -1, 0, 1, 2])
print(abs(x))
[2 1 0 1 2]
x = [1, 2, 3]
print("x =", x)
x = [1, 2, 3]
print("e^x =", np.exp(x))
e^x = [ 2.71828183 7.3890561 20.08553692]
print("2^x =", np.exp2(x))
2^x = [2. 4. 8.]
print("3^x =", np.power(3, x))
3^x = [ 3 9 27]
x = [1, 2, 4, 10]
print("x =", x)
x = [1, 2, 4, 10]
print("ln(x) =", np.log(x))
ln(x) = [0. 0.69314718 1.38629436 2.30258509]
print("log2(x) =", np.log2(x))
log2(x) = [0. 1. 2. 3.32192809]
print("log10(x) =", np.log10(x))
log10(x) = [0. 0.30103 0.60205999 1. ]
L = np.random.random(10000000)
print(sum(L))
4998938.684962408
print(np.sum(L)) # Corre mucho más rápido
4998938.684962775
print(min(L))
2.914314303215093e-07
print(np.min(L)) # Corre mucho más rápido
2.914314303215093e-07
print(max(L))
0.9999999263454943
print(np.max(L)) # Corre mucho más rápido
0.9999999263454943
M = np.random.random((3, 4))
print(M)
[[0.5045625 0.00914483 0.79432422 0.14847579]
[0.94375145 0.73409704 0.05227396 0.13637774]
[0.96618927 0.00388049 0.98816908 0.10469878]]
print(M.sum()) # Es el llamado a un método de nunpy
5.385945154795461
print(M.min(axis=0)) # Por columna
[0.5045625 0.00388049 0.05227396 0.10469878]
print(M.min(axis=1)) # Por fila
[0.00914483 0.05227396 0.00388049]
datos = np.array([189, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173,
175, 178, 183, 193, 178, 173, 174, 183, 183, 168, 170, 178,
182, 180, 183, 178, 182, 188, 175, 179, 183, 193, 182, 183,
177, 185, 188, 188, 182, 185])
print(datos)
[189 170 189 163 183 171 185 168 173 183 173 173 175 178 183 193 178 173
174 183 183 168 170 178 182 180 183 178 182 188 175 179 183 193 182 183
177 185 188 188 182 185]
print("Promedio: ", datos.mean())
Promedio: 179.73809523809524
print("Desviación Estándar: ", datos.std())
Desviación Estándar: 6.931843442745892
print("Mínimo: ", datos.min())
Mínimo: 163
print("Máximo: ", datos.max())
Máximo: 193
print("Percentile 25: ", np.percentile(datos, 25))
Percentile 25: 174.25
print("Mediana : ", np.median(datos))
Mediana : 182.0
print("Percentile 75: ", np.percentile(datos, 75))
Percentile 75: 183.0
** Centrando un arreglo**
X = np.random.random((10, 3))
print(X)
[[4.66170513e-01 3.14488488e-01 1.24027113e-04]
[8.33904050e-01 1.10999079e-01 3.30811423e-01]
[5.97064166e-01 5.61640133e-01 7.00999113e-01]
[8.78734333e-01 2.92396417e-01 6.99788597e-01]
[7.10949142e-01 3.01891423e-01 3.98255841e-01]
[4.35758101e-01 4.24157529e-02 3.14003476e-02]
[1.29161818e-01 9.56245276e-01 2.31027690e-01]
[7.48337026e-01 5.74165646e-01 9.15410773e-01]
[3.30716299e-01 9.95821867e-01 8.61583840e-01]
[1.87635747e-01 6.60882782e-01 4.43945054e-01]]
Xmedia = X.mean(0)
print(Xmedia)
[0.53184312 0.48109469 0.46133467]
X_centrado = X - Xmedia
print(X_centrado)
[[-0.06567261 -0.1666062 -0.46121064]
[ 0.30206093 -0.37009561 -0.13052325]
[ 0.06522105 0.08054545 0.23966444]
[ 0.34689121 -0.18869827 0.23845393]
[ 0.17910602 -0.17920326 -0.06307883]
[-0.09608502 -0.43867893 -0.42993432]
[-0.4026813 0.47515059 -0.23030698]
[ 0.21649391 0.09307096 0.4540761 ]
[-0.20112682 0.51472718 0.40024917]
[-0.34420737 0.1797881 -0.01738962]]
** Operaciones en Arreglos similares a las de R**
x = np.array([1, 2, 3, 4, 5])
print(x < 3)
[ True True False False False]
print(np.less(x,3)) # Internamente se invoca el método less de la clase numpy
[ True True False False False]
print(x > 3)
[False False False True True]
print(x <= 3)
[ True True True False False]
print(x >= 3)
[False False True True True]
print(x != 3)
[ True True False True True]
print(x == 3)
[False False True False False]
rng = np.random.RandomState(0)
x = rng.randint(10, size=(3, 4))
print(x)
[[5 0 3 3]
[7 9 3 5]
[2 4 7 6]]
print(x <= 3)
[[False True True True]
[False False True False]
[ True False False False]]
print(np.count_nonzero(x < 6))
8
** & = AND** y ** | = OR**
print(np.sum(x < 6)) # Es equivalente al anterior False=0 y True=1
8
print(np.sum(x < 6, axis=1)) # Por fila
[4 2 2]
print(np.sum((x > 4) & (x < 6)))
2
print(np.sum(~(x > 4) & (x < 6)))
6
print(np.sum((x > 4) | (x < 6)))
12
# Para saber si hay o no valores más grandes que 8?
print(np.any(x > 8))
True
** Similar que en R**
print(x)
[[5 0 3 3]
[7 9 3 5]
[2 4 7 6]]
print(x[x < 4])
[0 3 3 3 2]
print(np.sum(x[x < 4]))
11
** Operador OR**
A = np.array([1, 0, 1, 0, 1, 0], dtype=bool)
B = np.array([1, 1, 1, 0, 1, 1], dtype=bool)
print(A | B)
[ True True True False True True]
** Arreglos de índices “Fancy Indexing”**
rand = np.random.RandomState(42) # Fija la semilla aleatoria
x = rand.randint(100, size=10)
print(x)
[51 92 14 71 60 20 82 86 74 74]
print([x[3], x[7], x[2]])
[71, 86, 14]
** Es equivalente, pero pasando los indices en un arreglo**
ind = [3, 7, 4]
print(x[ind])
[71 86 60]
ind = np.array([[3, 7],
[4, 5]])
print(x[ind])
[[71 86]
[60 20]]
** Seleccionando puntos al azar**
mean = [0, 0]
cov = [[1, 2],
[2, 5]]
X = rand.multivariate_normal(mean, cov, 100)
print(X)
[[-0.644508 -0.46220608]
[ 0.7376352 1.21236921]
[ 0.88151763 1.12795177]
[ 2.04998983 5.97778598]
[-0.1711348 -2.06258746]
[ 0.67956979 0.83705124]
[ 1.46860232 1.22961093]
[ 0.35282131 1.49875397]
[-2.51552505 -5.64629995]
[ 0.0843329 -0.3543059 ]
[ 0.19199272 1.48901291]
[-0.02566217 -0.74987887]
[ 1.00569227 2.25287315]
[ 0.49514263 1.18939673]
[ 0.0629872 0.57349278]
[ 0.75093031 2.99487004]
[-3.0236127 -6.00766046]
[-0.53943081 -0.3478899 ]
[ 1.53817376 1.99973464]
[-0.50886808 -1.81099656]
[ 1.58115602 2.86410319]
[ 0.99305043 2.54294059]
[-0.87753796 -1.15767204]
[-1.11518048 -1.87508012]
[ 0.4299908 0.36324254]
[ 0.97253528 3.53815717]
[ 0.32124996 0.33137032]
[-0.74618649 -2.77366681]
[-0.88473953 -1.81495444]
[ 0.98783862 2.30280401]
[-1.2033623 -2.04402725]
[-1.51101746 -3.2818741 ]
[-2.76337717 -7.66760648]
[ 0.39158553 0.87949228]
[ 0.91181024 3.32968944]
[-0.84202629 -2.01226547]
[ 1.06586877 0.95500019]
[ 0.44457363 1.87828298]
[ 0.35936721 0.40554974]
[-0.90649669 -0.93486441]
[-0.35790389 -0.52363012]
[-1.33461668 -3.03203218]
[ 0.02815138 0.79654924]
[ 0.37785618 0.51409383]
[-1.06505097 -2.88726779]
[ 2.32083881 5.97698647]
[ 0.47605744 0.83634485]
[-0.35490984 -1.03657119]
[ 0.57532883 -0.79997124]
[ 0.33399913 2.32597923]
[ 0.6575612 -0.22389518]
[ 1.3707365 2.2348831 ]
[ 0.07099548 -0.29685467]
[ 0.6074983 1.47089233]
[-0.34226126 -1.10666237]
[ 0.69226246 1.21504303]
[-0.31112937 -0.75912097]
[-0.26888327 -1.89366817]
[ 0.42044896 1.85189522]
[ 0.21115245 2.00781492]
[-1.83106042 -2.91352836]
[ 0.7841796 1.97640753]
[ 0.10259314 1.24690575]
[-1.91100558 -3.66800923]
[ 0.13143756 -0.07833855]
[-0.1317045 -1.64159158]
[-0.14547282 -1.34125678]
[-0.51172373 -1.40960773]
[ 0.69758045 0.72563649]
[ 0.11677083 0.88385162]
[-1.16586444 -2.24482237]
[-2.23176235 -2.63958101]
[ 0.37857234 0.69112594]
[ 0.87475323 3.400675 ]
[-0.86864365 -3.03568353]
[-1.03637857 -1.18469125]
[-0.53334959 -0.37039911]
[ 0.30414557 -0.5828419 ]
[-1.47656656 -2.13046298]
[-0.31332021 -1.7895623 ]
[ 1.12659538 1.49627535]
[-1.19675798 -1.51633442]
[-0.75210154 -0.79770535]
[ 0.74577693 1.95834451]
[ 1.56094354 2.9330816 ]
[-0.72009966 -1.99780959]
[-1.32319163 -2.61218347]
[-2.56215914 -6.08410838]
[ 1.31256297 3.13143269]
[ 0.51575983 2.30284639]
[ 0.01374713 -0.11539344]
[-0.16863279 0.39422355]
[ 0.12065651 1.13236323]
[-0.83504984 -2.38632016]
[ 1.05185885 1.98418223]
[-0.69144553 -1.56919875]
[-1.2567603 -1.125898 ]
[ 0.09619333 -0.64335574]
[-0.99658689 -2.35038099]
[-1.21405259 -1.77693724]]
** Dimensión de la matriz**
print(X.shape)
(100, 2)
** Modificando valores con arreglos de índices “fancy index”**
x = np.arange(10)
print(x)
[0 1 2 3 4 5 6 7 8 9]
i = np.array([2, 1, 8, 4])
x[i] = 99
print(x)
[ 0 99 99 3 99 5 6 7 99 9]
** Ordenando vectores con numpy**
x = np.array([9, 1, -4, 23, 5])
np.sort(x)
** Que es equivalente invocando el método (orientado a objetos), pues x es un objeto tipo numpy**
x.sort()
print(x)
[-4 1 5 9 23]
** Para obtener los índices del vector ordenado**
x = np.array([9, 1, -4, 23, 5])
i = np.argsort(x)
print(i)
[2 1 4 0 3]
** Ordenando por filas y columnas un arreglo**
rand = np.random.RandomState(42)
X = rand.randint(0, 10, (4, 6))
print(X)
[[6 3 7 4 6 9]
[2 6 7 4 3 7]
[7 2 5 4 1 7]
[5 1 4 0 9 5]]
print(np.sort(X, axis=0)) # Ordena por columna
[[2 1 4 0 1 5]
[5 2 5 4 3 7]
[6 3 7 4 6 7]
[7 6 7 4 9 9]]
print(np.sort(X, axis=1)) # Ordena por fila
[[3 4 6 6 7 9]
[2 3 4 6 7 7]
[1 2 4 5 7 7]
[0 1 4 5 5 9]]
Pandas permite manejar objetos tipo DataFrame en Python, es decir, una matriz con nombres de filas y columnas y un poco más, como series de tiempo.
** Para ver la versión de Pandas que uno tiene**
import pandas as pd
print(pd.__version__)
0.23.4
poblacion = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
area = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
estados = pd.DataFrame({'Poblacion': poblacion,'Area': area})
print(estados)
Poblacion Area
California 38332521 423967
Florida 19552860 170312
Illinois 12882135 149995
New York 19651127 141297
Texas 26448193 695662
print(estados.index) # Observe que son objetos
Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')
print(estados.columns)
Index(['Poblacion', 'Area'], dtype='object')
print(estados['Area'])
California 423967
Florida 170312
Illinois 149995
New York 141297
Texas 695662
Name: Area, dtype: int64
** Otros ejemplos**
datos = [{'a': i, 'b': 2 * i} for i in range(30)]
print(pd.DataFrame(datos))
a b
0 0 0
1 1 2
2 2 4
3 3 6
4 4 8
5 5 10
6 6 12
7 7 14
8 8 16
9 9 18
10 10 20
11 11 22
12 12 24
13 13 26
14 14 28
15 15 30
16 16 32
17 17 34
18 18 36
19 19 38
20 20 40
21 21 42
22 22 44
23 23 46
24 24 48
25 25 50
26 26 52
27 27 54
28 28 56
29 29 58
print(pd.DataFrame(np.random.rand(3, 2),
columns=['C1', 'C2'],
index=['a', 'b', 'c']))
C1 C2
a 0.620066 0.065418
b 0.966642 0.523054
c 0.374949 0.878036
** Valores nulos en Python “NaN”**
print(pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]))
a b c
0 1.0 2 NaN
1 NaN 3 4.0
** Índices explícitos e implícitos - Indexadores: loc, iloc**
datos = pd.Series(['a', 'b', 'c'], index=[10, 30, 50])
**Nota:** Los índices explícitos son 10, 30 y 50; mientras que los índices implícitos son 0, 1 y 2.
print(datos)
## 10 a
## 30 b
## 50 c
## dtype: object
print(datos[10]) # Índice explícito
## a
print(datos[1]) # Da error
## KeyError: 1
##
## Detailed traceback:
## File "<string>", line 1, in <module>
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/pandas/core/series.py", line 767, in __getitem__
## result = self.index.get_value(self, key)
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3118, in get_value
## tz=getattr(series.dtype, 'tz', None))
## File "pandas/_libs/index.pyx", line 106, in pandas._libs.index.IndexEngine.get_value
## File "pandas/_libs/index.pyx", line 114, in pandas._libs.index.IndexEngine.get_value
## File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
## File "pandas/_libs/hashtable_class_helper.pxi", line 958, in pandas._libs.hashtable.Int64HashTable.get_item
## File "pandas/_libs/hashtable_class_helper.pxi", line 964, in pandas._libs.hashtable.Int64HashTable.get_item
print(datos[0:2]) # Índices implícitos, esto es extraño y confuso
## 10 a
## 30 b
## dtype: object
print(datos[1:3]) # Índices implícitos, esto es extraño y confuso
## 30 b
## 50 c
## dtype: object
** Lo anterior causa mucha confusión, loc siempre se refiere al índice explícito**
print(datos.loc[10])
## a
datos.loc[1] # Da error
## KeyError: 'the label [1] is not in the [index]'
##
## Detailed traceback:
## File "<string>", line 1, in <module>
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1478, in __getitem__
## return self._getitem_axis(maybe_callable, axis=axis)
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1911, in _getitem_axis
## self._validate_key(key, axis)
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1798, in _validate_key
## error()
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py", line 1785, in error
## axis=self.obj._get_axis_name(axis)))
print(datos.loc[10:30])
## 10 a
## 30 b
## dtype: object
print(datos.loc[30:50])
## 30 b
## 50 c
## dtype: object
** Seleccionando datos en un DataFrame**
poblacion = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
area = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}
estados = pd.DataFrame({'Poblacion': poblacion,'Area': area})
print(estados)
Poblacion Area
California 38332521 423967
Florida 19552860 170312
Illinois 12882135 149995
New York 19651127 141297
Texas 26448193 695662
print(estados['Poblacion'])
California 38332521
Florida 19552860
Illinois 12882135
New York 19651127
Texas 26448193
Name: Poblacion, dtype: int64
print(estados.Poblacion)
California 38332521
Florida 19552860
Illinois 12882135
New York 19651127
Texas 26448193
Name: Poblacion, dtype: int64
** Agregando una variable**
estados['Densidad'] = estados['Poblacion'] / estados['Area']
print(estados)
Poblacion Area Densidad
California 38332521 423967 90.413926
Florida 19552860 170312 114.806121
Illinois 12882135 149995 85.883763
New York 19651127 141297 139.076746
Texas 26448193 695662 38.018740
** Transponer el DataFrame**
ET = estados.T
print(ET)
California Florida ... New York Texas
Poblacion 3.833252e+07 1.955286e+07 ... 1.965113e+07 2.644819e+07
Area 4.239670e+05 1.703120e+05 ... 1.412970e+05 6.956620e+05
Densidad 9.041393e+01 1.148061e+02 ... 1.390767e+02 3.801874e+01
[3 rows x 5 columns]
** Se puede usar iloc y loc en un DataFrame**
estados
print(estados.iloc[:3, :2]) # Índices implícitos
Poblacion Area
California 38332521 423967
Florida 19552860 170312
Illinois 12882135 149995
print(estados.loc[:'Illinois', :'Poblacion']) # Índices Explícitos
Poblacion
California 38332521
Florida 19552860
Illinois 12882135
** Otros ejemplos**
print(estados.loc[estados.Densidad > 100, ['Poblacion', 'Densidad']])
Poblacion Densidad
Florida 19552860 114.806121
New York 19651127 139.076746
print(estados['Florida':'Illinois'])
Poblacion Area Densidad
Florida 19552860 170312 114.806121
Illinois 12882135 149995 85.883763
print(estados[1:3])
Poblacion Area Densidad
Florida 19552860 170312 114.806121
Illinois 12882135 149995 85.883763
print(estados[estados.Densidad > 100])
Poblacion Area Densidad
Florida 19552860 170312 114.806121
New York 19651127 141297 139.076746
** Modificando un dato**
estados.iloc[0, 2] = 0
print(estados)
Poblacion Area Densidad
California 38332521 423967 0.000000
Florida 19552860 170312 114.806121
Illinois 12882135 149995 85.883763
New York 19651127 141297 139.076746
Texas 26448193 695662 38.018740
Pandas hereda todas las funcionalidades de numpy, pues Pandas es una especialización de numpy.
import pandas as pd
import numpy as np
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),columns=['A', 'B', 'C', 'D'])
print(df)
A B C D
0 8 8 1 6
1 7 7 8 1
2 5 9 8 9
** Aplica seno al DataFrame**
np.sin(df * np.pi / 4)
“None” es el indicador propio de Python para indicar datos ausentes, pero solo funciona con datos que hereden de la clase “object”
valores1 = np.array([1, None, 3, 4])
print(valores1) # Observe que dtype=object
[1 None 3 4]
import numpy as np
valores1 = np.array([1, None, 3, 4],dtype=int) # Causa error el None
## TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
##
## Detailed traceback:
## File "<string>", line 1, in <module>
valores1.sum() # causa error el None
## TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
##
## Detailed traceback:
## File "<string>", line 1, in <module>
## File "/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py", line 36, in _sum
## return umr_sum(a, axis, dtype, out, keepdims, initial)
** Uso del NaN (acrónimo para “Not a Number”)**
valores2 = np.array([1, np.nan, 3, 24])
print(valores2)
[ 1. nan 3. 24.]
print(valores2.sum())
nan
** No da error, da nan porque:**
print(1 + np.nan) # Da nan
nan
valores2.dtype # Ahora es flotante
print(valores2.min())
nan
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:32: RuntimeWarning: invalid value encountered in reduce
return umr_minimum(a, axis, None, out, keepdims, initial)
print(valores2.max())
nan
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:28: RuntimeWarning: invalid value encountered in reduce
return umr_maximum(a, axis, None, out, keepdims, initial)
** Todo da nan, solución usar: **
print(np.nansum(valores2))
28.0
print(np.nanmin(valores2))
1.0
print(np.nanmax(valores2))
24.0
NaN y None en Pandas: básicamente son intercambiables, ambos se convierten a NaN.
datos = pd.Series([1, np.nan, 2, None, 90, -10, 76])
print(datos)
0 1.0
1 NaN
2 2.0
3 NaN
4 90.0
5 -10.0
6 76.0
dtype: float64
print(datos.isnull()) # Para detectarlos
0 False
1 True
2 False
3 True
4 False
5 False
6 False
dtype: bool
** Eliminando valores nan, se debe eliminar toda la fila o toda la columna**
df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6],
[-1, 94, 0]])
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(df.dropna())# Por defecto es por fila
0 1 2
1 2.0 3.0 5
3 -1.0 94.0 0
print(df.dropna(axis='columns'))
2
0 2
1 5
2 6
3 0
** Ejemplo con un data frame**
df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6],
[-1, 94, 0]])
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(df.fillna(0)) # Rellena (imputa el dato) con ceros
0 1 2
0 1.0 0.0 2
1 2.0 3.0 5
2 0.0 4.0 6
3 -1.0 94.0 0
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(df.fillna(method='bfill',axis=1)) # Rellena con en posterior de la fila
0 1 2
0 1.0 2.0 2.0
1 2.0 3.0 5.0
2 4.0 4.0 6.0
3 -1.0 94.0 0.0
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(df.fillna(method='bfill',axis=0)) # Rellena con en posterior de la columna
0 1 2
0 1.0 3.0 2
1 2.0 3.0 5
2 -1.0 4.0 6
3 -1.0 94.0 0
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(df.fillna(method='ffill',axis=1)) # Rellena con en anterior de la fila
0 1 2
0 1.0 1.0 2.0
1 2.0 3.0 5.0
2 NaN 4.0 6.0
3 -1.0 94.0 0.0
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(df.fillna(method='ffill',axis=0)) # Rellena con en anterior de la columna
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 2.0 4.0 6
3 -1.0 94.0 0
print(df)
0 1 2
0 1.0 NaN 2
1 2.0 3.0 5
2 NaN 4.0 6
3 -1.0 94.0 0
print(np.mean(df))
0 0.666667
1 33.666667
2 3.250000
dtype: float64
print(df.fillna(np.mean(df))) # Rellena con la media por columna
0 1 2
0 1.000000 33.666667 2
1 2.000000 3.000000 5
2 0.666667 4.000000 6
3 -1.000000 94.000000 0
** En numpy**
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
print(np.concatenate([x, y, z]))
[1 2 3 4 5 6 7 8 9]
x = [[1, 2],[3, 4]]
y = [[-1, -2],[-3, -4]]
print(np.concatenate([x, y], axis=1))
[[ 1 2 -1 -2]
[ 3 4 -3 -4]]
** En pandas**
** Ejemplo DataFrame por filas**
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
print(df1)
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
print(df2)
A B C D
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
print(df3)
A B C D
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11
resultado = pd.concat([df1, df2, df3])
print(resultado)
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11
** Ejemplo DataFrame por columnas**
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
print(df1)
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
df2 = pd.DataFrame({'E': ['A4', 'A5', 'A6', 'A7'],
'F': ['B4', 'B5', 'B6', 'B7'],
'G': ['C4', 'C5', 'C6', 'C7'],
'H': ['D4', 'D5', 'D6', 'D7']},
index=[0, 1, 2, 3])
print(df2)
E F G H
0 A4 B4 C4 D4
1 A5 B5 C5 D5
2 A6 B6 C6 D6
3 A7 B7 C7 D7
df3 = pd.DataFrame({'I': ['A8', 'A9', 'A10', 'A11'],
'J': ['B8', 'B9', 'B10', 'B11'],
'K': ['C8', 'C9', 'C10', 'C11'],
'L': ['D8', 'D9', 'D10', 'D11']},
index=[0, 1, 2, 3])
print(df3)
I J K L
0 A8 B8 C8 D8
1 A9 B9 C9 D9
2 A10 B10 C10 D10
3 A11 B11 C11 D11
resultado = pd.concat([df1, df2, df3],axis=1)
print(resultado)
A B C D E F G H I J K L
0 A0 B0 C0 D0 A4 B4 C4 D4 A8 B8 C8 D8
1 A1 B1 C1 D1 A5 B5 C5 D5 A9 B9 C9 D9
2 A2 B2 C2 D2 A6 B6 C6 D6 A10 B10 C10 D10
3 A3 B3 C3 D3 A7 B7 C7 D7 A11 B11 C11 D11
**Nota:** ¿Qué pasa si todas las columnas NO son iguales? **Ejemplo DataFrame por filas**
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
print(df1)
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
df2 = pd.DataFrame({'C': ['A4', 'A5', 'A6', 'A7'],
'D': ['B4', 'B5', 'B6', 'B7'],
'E': ['C4', 'C5', 'C6', 'C7'],
'F': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
print(df2)
C D E F
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
resultado = pd.concat([df1, df2])
/Users/oldemarrodriguez/anaconda3/bin/python3.7:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
print(resultado)
A B C D E F
0 A0 B0 C0 D0 NaN NaN
1 A1 B1 C1 D1 NaN NaN
2 A2 B2 C2 D2 NaN NaN
3 A3 B3 C3 D3 NaN NaN
4 NaN NaN A4 B4 C4 D4
5 NaN NaN A5 B5 C5 D5
6 NaN NaN A6 B6 C6 D6
7 NaN NaN A7 B7 C7 D7
resultado = pd.concat([df1, df2],join='inner')
print(resultado)
C D
0 C0 D0
1 C1 D1
2 C2 D2
3 C3 D3
4 A4 B4
5 A5 B5
6 A6 B6
7 A7 B7
resultado = pd.concat([df1, df2],join='outer') # Por defecto es outer
print(resultado)
A B C D E F
0 A0 B0 C0 D0 NaN NaN
1 A1 B1 C1 D1 NaN NaN
2 A2 B2 C2 D2 NaN NaN
3 A3 B3 C3 D3 NaN NaN
4 NaN NaN A4 B4 C4 D4
5 NaN NaN A5 B5 C5 D5
6 NaN NaN A6 B6 C6 D6
7 NaN NaN A7 B7 C7 D7
import matplotlib.pyplot as plt
import numpy as np
** Estilo clásico**
plt.style.use('classic')
** Ejemplos de gráficos de líneas (graficando funciones)**
# Datos del eje X para los siguientes gráficos
x = np.linspace(0, 10, 100)
print(x)
[ 0. 0.1010101 0.2020202 0.3030303 0.4040404 0.50505051
0.60606061 0.70707071 0.80808081 0.90909091 1.01010101 1.11111111
1.21212121 1.31313131 1.41414141 1.51515152 1.61616162 1.71717172
1.81818182 1.91919192 2.02020202 2.12121212 2.22222222 2.32323232
2.42424242 2.52525253 2.62626263 2.72727273 2.82828283 2.92929293
3.03030303 3.13131313 3.23232323 3.33333333 3.43434343 3.53535354
3.63636364 3.73737374 3.83838384 3.93939394 4.04040404 4.14141414
4.24242424 4.34343434 4.44444444 4.54545455 4.64646465 4.74747475
4.84848485 4.94949495 5.05050505 5.15151515 5.25252525 5.35353535
5.45454545 5.55555556 5.65656566 5.75757576 5.85858586 5.95959596
6.06060606 6.16161616 6.26262626 6.36363636 6.46464646 6.56565657
6.66666667 6.76767677 6.86868687 6.96969697 7.07070707 7.17171717
7.27272727 7.37373737 7.47474747 7.57575758 7.67676768 7.77777778
7.87878788 7.97979798 8.08080808 8.18181818 8.28282828 8.38383838
8.48484848 8.58585859 8.68686869 8.78787879 8.88888889 8.98989899
9.09090909 9.19191919 9.29292929 9.39393939 9.49494949 9.5959596
9.6969697 9.7979798 9.8989899 10. ]
** Nota:** Lo siguiente se ejecuta todo junto
plt.plot(x, np.sin(x))
plt.plot(x, np.cos(x))
open_close_plot() # NO es necesario ponerlo en Jupiter o en Spider, solo en RStudio
plt.figure() # crea la figura
# Crea el primer panel
plt.subplot(2, 1, 1) # (filas, columnas, número de paneles)
plt.plot(x, np.sin(x))
# crea el segundo panel
plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x))
open_close_plot()
** Un estilo Orientado a Objetos para situaciones más complejas**
fig, ax = plt.subplots(2)
# Llama el método plot() method
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))
open_close_plot()
** Otro ejemplo Orientado a Objetos**
fig = plt.figure()
ax = plt.axes()
x = np.linspace(0, 10, 1000)
ax.plot(x, np.sin(x))
open_close_plot()
** Estilo funcional**
x = np.linspace(0, 10, 1000)
plt.plot(x, np.sin(x))
open_close_plot()
plt.plot(x, np.sin(x - 0), color='blue') # Nombre del color
plt.plot(x, np.sin(x - 1), color='g') # Código del color (rgbcmyk)
plt.plot(x, np.sin(x - 2), color='0.75') # escala de gris entre 0 y 1
plt.plot(x, np.sin(x - 3), color='#FFDD44') # Código exadecimal (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # Tupla RGB entre 0 y 1
plt.plot(x, np.sin(x - 5), color='chartreuse') # Nombres de color en HTML
open_close_plot()
plt.plot(x, x + 0, linestyle='solid')
plt.plot(x, x + 1, linestyle='dashed')
plt.plot(x, x + 2, linestyle='dashdot')
plt.plot(x, x + 3, linestyle='dotted')
open_close_plot()
# Lo mismo pero con código
plt.plot(x, x + 4, linestyle='-')
plt.plot(x, x + 5, linestyle='--')
plt.plot(x, x + 6, linestyle='-.')
plt.plot(x, x + 7, linestyle=':')
open_close_plot()
** Cambiando los límites de los ejes**
plt.plot(x, np.sin(x))
plt.xlim(-1, 11)
plt.ylim(-1.5, 1.5)
open_close_plot()
** Otro ejemplo**
plt.plot(x, np.sin(x))
plt.axis('tight')
open_close_plot()
plt.plot(x, np.sin(x))
plt.title("Función Seno(x)")
plt.xlabel("x")
plt.ylabel("Seno(x)")
open_close_plot()
plt.plot(x, np.sin(x), '-g', label='Seno(x)')
plt.plot(x, np.cos(x), ':b', label='Coseno(x)')
plt.axis('equal')
plt.legend()
open_close_plot()
** Orientado a Objetos**
ax = plt.axes()
ax.plot(x, np.sin(x))
ax.set(xlim=(0, 10), ylim=(-2, 2),
xlabel='x', ylabel='Seno(x)',
title='Un ploteo de Seno(x)')
open_close_plot()
** A la izquierda como función de matplotlib**
** A la derecha como método del objeto ax**
plt.xlabel() → ax.set_xlabel()
plt.ylabel() → ax.set_ylabel()
plt.xlim() → ax.set_xlim()
plt.ylim() → ax.set_ylim()
plt.title() → ax.set_title()
** Ejemplo**
x = np.linspace(0, 10, 30)
y = np.sin(x)
plt.plot(x, y, 'o', color='black')
open_close_plot()
** Ejemplo**
rng = np.random.RandomState(0)
for marca in ['o', '.', ',', 'x', '+', 'v', '^', '<', '>', 's', 'd']:
plt.plot(rng.rand(5), rng.rand(5), marca,
label="marca='{0}'".format(marca))
plt.legend(numpoints=1)
plt.xlim(0, 1.8)
open_close_plot()
** Ejemplo**
plt.plot(x, y, '-ok')
open_close_plot()
** Ejemplo**
plt.plot(x, y, '-p', color='gray',
markersize=15, linewidth=4,
markerfacecolor='white',
markeredgecolor='gray',
markeredgewidth=2)
plt.ylim(-1.2, 1.2)
open_close_plot()
** Comando scatter, más potente** ** Ejemplo**
plt.scatter(x, y, marker='o')
open_close_plot()
** Ejemplo**
rng = np.random.RandomState(0)
x = rng.randn(100)
y = rng.randn(100)
colores = rng.rand(100)
tamanos = 1000 * rng.rand(100)
plt.scatter(x, y, c=colores, s=tamanos, alpha=0.3,cmap='viridis')
plt.colorbar()
open_close_plot()
** Ejemplo**
from sklearn.datasets import load_iris
iris = load_iris()
print(iris)
{'data': array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
[5.4, 3.4, 1.7, 0.2],
[5.1, 3.7, 1.5, 0.4],
[4.6, 3.6, 1. , 0.2],
[5.1, 3.3, 1.7, 0.5],
[4.8, 3.4, 1.9, 0.2],
[5. , 3. , 1.6, 0.2],
[5. , 3.4, 1.6, 0.4],
[5.2, 3.5, 1.5, 0.2],
[5.2, 3.4, 1.4, 0.2],
[4.7, 3.2, 1.6, 0.2],
[4.8, 3.1, 1.6, 0.2],
[5.4, 3.4, 1.5, 0.4],
[5.2, 4.1, 1.5, 0.1],
[5.5, 4.2, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.2],
[5. , 3.2, 1.2, 0.2],
[5.5, 3.5, 1.3, 0.2],
[4.9, 3.6, 1.4, 0.1],
[4.4, 3. , 1.3, 0.2],
[5.1, 3.4, 1.5, 0.2],
[5. , 3.5, 1.3, 0.3],
[4.5, 2.3, 1.3, 0.3],
[4.4, 3.2, 1.3, 0.2],
[5. , 3.5, 1.6, 0.6],
[5.1, 3.8, 1.9, 0.4],
[4.8, 3. , 1.4, 0.3],
[5.1, 3.8, 1.6, 0.2],
[4.6, 3.2, 1.4, 0.2],
[5.3, 3.7, 1.5, 0.2],
[5. , 3.3, 1.4, 0.2],
[7. , 3.2, 4.7, 1.4],
[6.4, 3.2, 4.5, 1.5],
[6.9, 3.1, 4.9, 1.5],
[5.5, 2.3, 4. , 1.3],
[6.5, 2.8, 4.6, 1.5],
[5.7, 2.8, 4.5, 1.3],
[6.3, 3.3, 4.7, 1.6],
[4.9, 2.4, 3.3, 1. ],
[6.6, 2.9, 4.6, 1.3],
[5.2, 2.7, 3.9, 1.4],
[5. , 2. , 3.5, 1. ],
[5.9, 3. , 4.2, 1.5],
[6. , 2.2, 4. , 1. ],
[6.1, 2.9, 4.7, 1.4],
[5.6, 2.9, 3.6, 1.3],
[6.7, 3.1, 4.4, 1.4],
[5.6, 3. , 4.5, 1.5],
[5.8, 2.7, 4.1, 1. ],
[6.2, 2.2, 4.5, 1.5],
[5.6, 2.5, 3.9, 1.1],
[5.9, 3.2, 4.8, 1.8],
[6.1, 2.8, 4. , 1.3],
[6.3, 2.5, 4.9, 1.5],
[6.1, 2.8, 4.7, 1.2],
[6.4, 2.9, 4.3, 1.3],
[6.6, 3. , 4.4, 1.4],
[6.8, 2.8, 4.8, 1.4],
[6.7, 3. , 5. , 1.7],
[6. , 2.9, 4.5, 1.5],
[5.7, 2.6, 3.5, 1. ],
[5.5, 2.4, 3.8, 1.1],
[5.5, 2.4, 3.7, 1. ],
[5.8, 2.7, 3.9, 1.2],
[6. , 2.7, 5.1, 1.6],
[5.4, 3. , 4.5, 1.5],
[6. , 3.4, 4.5, 1.6],
[6.7, 3.1, 4.7, 1.5],
[6.3, 2.3, 4.4, 1.3],
[5.6, 3. , 4.1, 1.3],
[5.5, 2.5, 4. , 1.3],
[5.5, 2.6, 4.4, 1.2],
[6.1, 3. , 4.6, 1.4],
[5.8, 2.6, 4. , 1.2],
[5. , 2.3, 3.3, 1. ],
[5.6, 2.7, 4.2, 1.3],
[5.7, 3. , 4.2, 1.2],
[5.7, 2.9, 4.2, 1.3],
[6.2, 2.9, 4.3, 1.3],
[5.1, 2.5, 3. , 1.1],
[5.7, 2.8, 4.1, 1.3],
[6.3, 3.3, 6. , 2.5],
[5.8, 2.7, 5.1, 1.9],
[7.1, 3. , 5.9, 2.1],
[6.3, 2.9, 5.6, 1.8],
[6.5, 3. , 5.8, 2.2],
[7.6, 3. , 6.6, 2.1],
[4.9, 2.5, 4.5, 1.7],
[7.3, 2.9, 6.3, 1.8],
[6.7, 2.5, 5.8, 1.8],
[7.2, 3.6, 6.1, 2.5],
[6.5, 3.2, 5.1, 2. ],
[6.4, 2.7, 5.3, 1.9],
[6.8, 3. , 5.5, 2.1],
[5.7, 2.5, 5. , 2. ],
[5.8, 2.8, 5.1, 2.4],
[6.4, 3.2, 5.3, 2.3],
[6.5, 3. , 5.5, 1.8],
[7.7, 3.8, 6.7, 2.2],
[7.7, 2.6, 6.9, 2.3],
[6. , 2.2, 5. , 1.5],
[6.9, 3.2, 5.7, 2.3],
[5.6, 2.8, 4.9, 2. ],
[7.7, 2.8, 6.7, 2. ],
[6.3, 2.7, 4.9, 1.8],
[6.7, 3.3, 5.7, 2.1],
[7.2, 3.2, 6. , 1.8],
[6.2, 2.8, 4.8, 1.8],
[6.1, 3. , 4.9, 1.8],
[6.4, 2.8, 5.6, 2.1],
[7.2, 3. , 5.8, 1.6],
[7.4, 2.8, 6.1, 1.9],
[7.9, 3.8, 6.4, 2. ],
[6.4, 2.8, 5.6, 2.2],
[6.3, 2.8, 5.1, 1.5],
[6.1, 2.6, 5.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[6.3, 3.4, 5.6, 2.4],
[6.4, 3.1, 5.5, 1.8],
[6. , 3. , 4.8, 1.8],
[6.9, 3.1, 5.4, 2.1],
[6.7, 3.1, 5.6, 2.4],
[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 5.1, 1.9],
[6.8, 3.2, 5.9, 2.3],
[6.7, 3.3, 5.7, 2.5],
[6.7, 3. , 5.2, 2.3],
[6.3, 2.5, 5. , 1.9],
[6.5, 3. , 5.2, 2. ],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n \n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n - Fisher, R.A. 
"The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], 'filename': '/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/sklearn/datasets/data/iris.csv'}
caracteristicas = iris.data.T
print(caracteristicas)
[[5.1 4.9 4.7 4.6 5. 5.4 4.6 5. 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
5.7 5.1 5.4 5.1 4.6 5.1 4.8 5. 5. 5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.
5.5 4.9 4.4 5.1 5. 4.5 4.4 5. 5.1 4.8 5.1 4.6 5.3 5. 7. 6.4 6.9 5.5
6.5 5.7 6.3 4.9 6.6 5.2 5. 5.9 6. 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1
6.3 6.1 6.4 6.6 6.8 6.7 6. 5.7 5.5 5.5 5.8 6. 5.4 6. 6.7 6.3 5.6 5.5
5.5 6.1 5.8 5. 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3
6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6. 6.9 5.6 7.7 6.3 6.7 7.2
6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6. 6.9 6.7 6.9 5.8 6.8
6.7 6.7 6.3 6.5 6.2 5.9]
[3.5 3. 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 3.7 3.4 3. 3. 4. 4.4 3.9 3.5
3.8 3.8 3.4 3.7 3.6 3.3 3.4 3. 3.4 3.5 3.4 3.2 3.1 3.4 4.1 4.2 3.1 3.2
3.5 3.6 3. 3.4 3.5 2.3 3.2 3.5 3.8 3. 3.8 3.2 3.7 3.3 3.2 3.2 3.1 2.3
2.8 2.8 3.3 2.4 2.9 2.7 2. 3. 2.2 2.9 2.9 3.1 3. 2.7 2.2 2.5 3.2 2.8
2.5 2.8 2.9 3. 2.8 3. 2.9 2.6 2.4 2.4 2.7 2.7 3. 3.4 3.1 2.3 3. 2.5
2.6 3. 2.6 2.3 2.7 3. 2.9 2.9 2.5 2.8 3.3 2.7 3. 2.9 3. 3. 2.5 2.9
2.5 3.6 3.2 2.7 3. 2.5 2.8 3.2 3. 3.8 2.6 2.2 3.2 2.8 2.8 2.7 3.3 3.2
2.8 3. 2.8 3. 2.8 3.8 2.8 2.8 2.6 3. 3.4 3.1 3. 3.1 3.1 3.1 2.7 3.2
3.3 3. 2.5 3. 3.4 3. ]
[1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3 1.4
1.7 1.5 1.7 1.5 1. 1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4 1.5 1.2
1.3 1.4 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7 4.5 4.9 4.
4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4. 4.7 3.6 4.4 4.5 4.1 4.5 3.9 4.8 4.
4.9 4.7 4.3 4.4 4.8 5. 4.5 3.5 3.8 3.7 3.9 5.1 4.5 4.5 4.7 4.4 4.1 4.
4.4 4.6 4. 3.3 4.2 4.2 4.2 4.3 3. 4.1 6. 5.1 5.9 5.6 5.8 6.6 4.5 6.3
5.8 6.1 5.1 5.3 5.5 5. 5.1 5.3 5.5 6.7 6.9 5. 5.7 4.9 6.7 4.9 5.7 6.
4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9
5.7 5.2 5. 5.2 5.4 5.1]
[0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.2 0.1 0.1 0.2 0.4 0.4 0.3
0.3 0.3 0.2 0.4 0.2 0.5 0.2 0.2 0.4 0.2 0.2 0.2 0.2 0.4 0.1 0.2 0.2 0.2
0.2 0.1 0.2 0.2 0.3 0.3 0.2 0.6 0.4 0.3 0.2 0.2 0.2 0.2 1.4 1.5 1.5 1.3
1.5 1.3 1.6 1. 1.3 1.4 1. 1.5 1. 1.4 1.3 1.4 1.5 1. 1.5 1.1 1.8 1.3
1.5 1.2 1.3 1.4 1.4 1.7 1.5 1. 1.1 1. 1.2 1.6 1.5 1.6 1.5 1.3 1.3 1.3
1.2 1.4 1.2 1. 1.3 1.2 1.3 1.3 1.1 1.3 2.5 1.9 2.1 1.8 2.2 2.1 1.7 1.8
1.8 2.5 2. 1.9 2.1 2. 2.4 2.3 1.8 2.2 2.3 1.5 2.3 2. 2. 1.8 2.1 1.8
1.8 1.8 2.1 1.6 1.9 2. 2.2 1.5 1.4 2.3 2.4 1.8 1.8 2.1 2.4 2.3 1.9 2.3
2.5 2.3 1.9 2. 2.3 1.8]]
plt.scatter(caracteristicas[0], caracteristicas[1],
alpha=0.2,s=100*caracteristicas[3], c=iris.target, cmap='viridis')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
open_close_plot()
plt.style.use('seaborn-white')
datos = np.random.randn(1000)
plt.hist(datos)
open_close_plot()
plt.hist(datos, bins=30, normed=True, alpha=0.5,
histtype='stepfilled', color='steelblue',
edgecolor='none')
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_axes.py:6571: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
warnings.warn("The 'normed' kwarg is deprecated, and has been "
open_close_plot()
import scipy.stats
** Test de Shapiro-Wilk**
shapiro_resultados = scipy.stats.shapiro(datos)
print(shapiro_resultados)
(0.9983751177787781, 0.4758008122444153)
Test_Estadistico = shapiro_resultados[0]
print(Test_Estadistico)
0.9983751177787781
p_value = shapiro_resultados[1]
print(p_value)
0.4758008122444153
print(p_value < Test_Estadistico)
# OJO: comparar p_value con el estadístico NO es un criterio válido; lo correcto es comparar p_value con un nivel de significancia (p.ej. 0.05): si p_value > 0.05 no se rechaza la hipótesis de normalidad
True
** Test de Kolmogorov-Smirnov**
ks_resultados = scipy.stats.kstest(datos, cdf='norm')
print(ks_resultados)
KstestResult(statistic=0.03580670649015916, pvalue=0.15029848410508806)
Test_Estadistico = ks_resultados[0]
print(Test_Estadistico)
0.03580670649015916
p_value = ks_resultados[1]
print(p_value)
0.15029848410508806
print(p_value < Test_Estadistico)
# OJO: igual que antes, el criterio correcto es p_value vs. nivel de significancia (p.ej. 0.05), no p_value vs. estadístico; aquí p_value = 0.15 > 0.05, por lo que no se rechaza la normalidad
False
** Ejemplo**
datos = np.random.normal(0, 0.8, 1000)
plt.hist(datos)
plt.hist(datos, bins=30, normed=True, alpha=0.5,
histtype='stepfilled', color='steelblue',
edgecolor='none')
open_close_plot()
** Test de Shapiro-Wilk**
shapiro_resultados = scipy.stats.shapiro(datos)
print(shapiro_resultados)
(0.9992647767066956, 0.9714964628219604)
Test_Estadistico = shapiro_resultados[0]
print(Test_Estadistico)
0.9992647767066956
p_value = shapiro_resultados[1]
print(p_value)
0.9714964628219604
print(p_value < Test_Estadistico)
True
** Test de Kolmogorov-Smirnov**
ks_resultados = scipy.stats.kstest(datos, cdf='norm')
print(ks_resultados)
KstestResult(statistic=0.052690249452009355, pvalue=0.007471001775292363)
Test_Estadistico = ks_resultados[0]
print(Test_Estadistico)
0.052690249452009355
p_value = ks_resultados[1]
print(p_value)
0.007471001775292363
print(p_value < Test_Estadistico)
True
** Ejemplo**
x1 = np.random.normal(0, 0.8, 1000)
x2 = np.random.normal(-2, 1, 1000)
x3 = np.random.normal(3, 2, 1000)
estilo = dict(histtype='stepfilled', alpha=0.3, normed=True, bins=40)
plt.hist(x1, ** estilo)
open_close_plot()
plt.hist(x2, ** estilo)
open_close_plot()
plt.hist(x3, ** estilo)
open_close_plot()
** Ejemplo**
x = np.linspace(0, 10, 1000)
fig, ax = plt.subplots()
ax.plot(x, np.sin(x), '-b', label='Seno')
ax.plot(x, np.cos(x), '--r', label='Coseno')
ax.axis('equal')
leg = ax.legend()
plt.show()
** Ejemplo**
Se cambia un atributo al objeto fig y se plotea de nuevo
ax.legend(frameon=False, loc='lower center', ncol=2)
plt.show()
** Ejemplo**
Se cambia un atributo al objeto fig y se plotea de nuevo
ax.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1)
open_close_plot()
** Ejemplo**
ax1 = plt.axes() # Ejes
ax2 = plt.axes([0.65, 0.65, 0.2, 0.2])
open_close_plot()
** Ejemplo**
fig = plt.figure()
ax1 = fig.add_axes([0.1, 0.5, 0.8, 0.4],
xticklabels=[], ylim=(-1.2, 1.2))
ax2 = fig.add_axes([0.1, 0.1, 0.8, 0.4],
ylim=(-1.2, 1.2))
x = np.linspace(0, 10)
ax1.plot(np.sin(x))
ax2.plot(np.cos(x))
open_close_plot()
** Ejemplo**
for i in range(1, 7):
plt.subplot(2, 3, i)
plt.text(0.5, 0.5, str((2, 3, i)),fontsize=18, ha='center')
open_close_plot()
** Ejemplo**
fig, ax = plt.subplots(2, 3, sharex='col', sharey='row')
# Los ejes son arreglos bidimensionales [i, j]
for i in range(2):
for j in range(3):
ax[i, j].text(0.5, 0.5, str((i, j)),
fontsize=18, ha='center')
open_close_plot()
** Ejemplo Dígitos, los vamos a usar más adelante en el curso**
from sklearn.datasets import load_digits
digitos = load_digits(n_class=6)
print(digitos)
{'data': array([[ 0., 0., 5., ..., 0., 0., 0.],
[ 0., 0., 0., ..., 10., 0., 0.],
[ 0., 0., 0., ..., 16., 9., 0.],
...,
[ 0., 0., 0., ..., 9., 0., 0.],
[ 0., 0., 0., ..., 4., 0., 0.],
[ 0., 0., 6., ..., 6., 0., 0.]]), 'target': array([0, 1, 2, ..., 4, 4, 0]), 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), 'images': array([[[ 0., 0., 5., ..., 1., 0., 0.],
[ 0., 0., 13., ..., 15., 5., 0.],
[ 0., 3., 15., ..., 11., 8., 0.],
...,
[ 0., 4., 11., ..., 12., 7., 0.],
[ 0., 2., 14., ..., 12., 0., 0.],
[ 0., 0., 6., ..., 0., 0., 0.]],
[[ 0., 0., 0., ..., 5., 0., 0.],
[ 0., 0., 0., ..., 9., 0., 0.],
[ 0., 0., 3., ..., 6., 0., 0.],
...,
[ 0., 0., 1., ..., 6., 0., 0.],
[ 0., 0., 1., ..., 6., 0., 0.],
[ 0., 0., 0., ..., 10., 0., 0.]],
[[ 0., 0., 0., ..., 12., 0., 0.],
[ 0., 0., 3., ..., 14., 0., 0.],
[ 0., 0., 8., ..., 16., 0., 0.],
...,
[ 0., 9., 16., ..., 0., 0., 0.],
[ 0., 3., 13., ..., 11., 5., 0.],
[ 0., 0., 0., ..., 16., 9., 0.]],
...,
[[ 0., 0., 0., ..., 6., 0., 0.],
[ 0., 0., 0., ..., 2., 0., 0.],
[ 0., 0., 8., ..., 1., 2., 0.],
...,
[ 0., 12., 16., ..., 16., 1., 0.],
[ 0., 1., 7., ..., 13., 0., 0.],
[ 0., 0., 0., ..., 9., 0., 0.]],
[[ 0., 0., 0., ..., 4., 0., 0.],
[ 0., 0., 4., ..., 0., 0., 0.],
[ 0., 0., 12., ..., 4., 3., 0.],
...,
[ 0., 12., 16., ..., 13., 0., 0.],
[ 0., 0., 4., ..., 8., 0., 0.],
[ 0., 0., 0., ..., 4., 0., 0.]],
[[ 0., 0., 6., ..., 11., 1., 0.],
[ 0., 0., 16., ..., 16., 1., 0.],
[ 0., 3., 16., ..., 13., 6., 0.],
...,
[ 0., 5., 16., ..., 16., 5., 0.],
[ 0., 1., 15., ..., 16., 1., 0.],
[ 0., 0., 6., ..., 6., 0., 0.]]]), 'DESCR': ".. _digits_dataset:\n\nOptical recognition of handwritten digits dataset\n--------------------------------------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 5620\n :Number of Attributes: 64\n :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n :Missing Attribute Values: None\n :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\n.. topic:: References\n\n - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n Graduate Studies in Science and Engineering, Bogazici University.\n - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n Linear dimensionalityreduction using relevance weighted LDA. 
School of\n Electrical and Electronic Engineering Nanyang Technological University.\n 2005.\n - Claudio Gentile. A New Approximate Maximal Margin Classification\n Algorithm. NIPS. 2000."}
fig, ax = plt.subplots(8, 8, figsize=(6, 6))
for i, axi in enumerate(ax.flat):
axi.imshow(digitos.images[i], cmap='binary')
axi.set(xticks=[], yticks=[])
open_close_plot()
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # Para una diferente version de matplotlib
** Ejemplo**
fig = plt.figure()
ax = Axes3D(fig)
open_close_plot()
** Ejemplo**
fig = plt.figure()
ax = Axes3D(fig)
# Datos para la línea en 3D
zline = np.linspace(0, 15, 1000)
xline = np.sin(zline)
yline = np.cos(zline)
ax.plot3D(xline, yline, zline, 'gray')
open_close_plot()
** Datos para los puntos**
fig = plt.figure()
ax = Axes3D(fig)
zdata = 15 * np.random.random(100)
xdata = np.sin(zdata) + 0.1 * np.random.randn(100)
ydata = np.cos(zdata) + 0.1 * np.random.randn(100)
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='Greens')
open_close_plot()
** El paquete Seaborn**
import seaborn as sns
iris = sns.load_dataset("iris")
print(iris.head())
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
corr = sns.pairplot(iris, hue='species', size=2.5)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
warnings.warn(msg, UserWarning)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
open_close_plot()
import pandas as pd
import prince
import os
import pandas as pd
import numpy as np
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
/Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.head())
sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
print(datos.shape)
(462, 10)
** describe() es como el summary de R para las variables numéricas**
print(datos.dropna().describe())
sbp tobacco ... alcohol age
count 462.000000 462.000000 ... 462.000000 462.000000
mean 138.326840 3.635649 ... 17.044394 42.816017
std 20.496317 4.593024 ... 24.481059 14.608956
min 101.000000 0.000000 ... 0.000000 15.000000
25% 124.000000 0.052500 ... 0.510000 31.000000
50% 134.000000 2.000000 ... 7.510000 45.000000
75% 148.000000 5.500000 ... 23.892500 55.000000
max 218.000000 31.200000 ... 147.190000 64.000000
[8 rows x 8 columns]
print(datos.describe())
sbp tobacco ... alcohol age
count 462.000000 462.000000 ... 462.000000 462.000000
mean 138.326840 3.635649 ... 17.044394 42.816017
std 20.496317 4.593024 ... 24.481059 14.608956
min 101.000000 0.000000 ... 0.000000 15.000000
25% 124.000000 0.052500 ... 0.510000 31.000000
50% 134.000000 2.000000 ... 7.510000 45.000000
75% 148.000000 5.500000 ... 23.892500 55.000000
max 218.000000 31.200000 ... 147.190000 64.000000
[8 rows x 8 columns]
print(datos.mean(numeric_only=True))
sbp 138.326840
tobacco 3.635649
ldl 4.740325
adiposity 25.406732
typea 53.103896
obesity 26.044113
alcohol 17.044394
age 42.816017
dtype: float64
print(datos.median(numeric_only=True))
sbp 134.000
tobacco 2.000
ldl 4.340
adiposity 26.115
typea 53.000
obesity 25.805
alcohol 7.510
age 45.000
dtype: float64
print(datos.std(numeric_only=True))
sbp 20.496317
tobacco 4.593024
ldl 2.070909
adiposity 7.780699
typea 9.817534
obesity 4.213680
alcohol 24.481059
age 14.608956
dtype: float64
print(datos.max(numeric_only=True))
sbp 218.00
tobacco 31.20
ldl 15.33
adiposity 42.49
typea 78.00
obesity 46.58
alcohol 147.19
age 64.00
dtype: float64
** Los percentiles**
print(datos.quantile(np.array([0,.25,.50,.75,1])))
sbp tobacco ldl adiposity typea obesity alcohol age
0.00 101.0 0.0000 0.9800 6.7400 13.0 14.7000 0.0000 15.0
0.25 124.0 0.0525 3.2825 19.7750 47.0 22.9850 0.5100 31.0
0.50 134.0 2.0000 4.3400 26.1150 53.0 25.8050 7.5100 45.0
0.75 148.0 5.5000 5.7900 31.2275 60.0 28.4975 23.8925 55.0
1.00 218.0 31.2000 15.3300 42.4900 78.0 46.5800 147.1900 64.0
** Contando datos en las variables categóricas**
print(pd.crosstab(index=datos["chd"],columns="count"))
col_0 count
chd
No 302
Si 160
print(pd.crosstab(index=datos["famhist"],columns="count"))
col_0 count
famhist
Absent 270
Present 192
** Otra forma**
print(datos['chd'].value_counts())
No 302
Si 160
Name: chd, dtype: int64
print(datos["famhist"].value_counts())
Absent 270
Present 192
Name: famhist, dtype: int64
** Tabla cruzada**
famhist_chd = pd.crosstab(index=datos["famhist"], columns=datos["chd"])
print(famhist_chd)
chd No Si
famhist
Absent 206 64
Present 96 96
famhist_chd.index = ["Absent","Present"]
print(famhist_chd)
chd No Si
Absent 206 64
Present 96 96
** Otra forma**
g_chd = pd.crosstab(index=datos["chd"],columns="count")
print(g_chd)
col_0 count
chd
No 302
Si 160
print(g_chd['count'][0])
302
print(g_chd['count'][1])
160
g_famhist = pd.crosstab(index=datos["famhist"],columns="count")
print(g_famhist)
col_0 count
famhist
Absent 270
Present 192
print(g_famhist['count'][0])
270
print(g_famhist['count'][1])
192
** Gráfico chd**
import matplotlib.pyplot as plt
alto = [g_chd['count'][0], g_chd['count'][1]]
barras = ('No', 'Sí')
y_pos = np.arange(len(barras))
plt.bar(y_pos, alto, color=['red','blue'])
plt.xticks(y_pos, barras)
open_close_plot()
** Gráfico famhist**
alto = [g_famhist['count'][0], g_famhist['count'][1]]
barras = ('Absent ', 'Present')
y_pos = np.arange(len(barras))
plt.bar(y_pos, alto, color=['red','blue'])
plt.xticks(y_pos, barras)
open_close_plot()
** Box Plots**
datos.head()
boxplots = datos.boxplot(return_type='axes')
open_close_plot()
** Función de densidad**
densidad = datos[datos.columns[:1]].plot(kind='density')
open_close_plot()
densidad = datos[datos.columns[8:9]].plot(kind='density')
open_close_plot()
densidad = datos['age'].plot(kind='density')
open_close_plot()
densidad = datos[datos.columns[:10]].plot(kind='density')
open_close_plot()
** Histogramas**
densidad = datos[datos.columns[:1]].plot(kind='hist')
open_close_plot()
densidad = datos[datos.columns[8:9]].plot(kind='hist')
open_close_plot()
densidad = datos['age'].plot(kind='hist')
open_close_plot()
densidad = datos[datos.columns[:10]].plot(kind='hist')
open_close_plot()
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(datos, hue='chd', size=2.5)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
warnings.warn(msg, UserWarning)
open_close_plot()
sns.pairplot(datos, hue='famhist', size=2.5)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
warnings.warn(msg, UserWarning)
open_close_plot()
** Nota:** Es “inteligente” e ignora las variables categóricas
corr = datos.corr()
print(corr)
sbp tobacco ldl ... obesity alcohol age
sbp 1.000000 0.212247 0.158296 ... 0.238067 0.140096 0.388771
tobacco 0.212247 1.000000 0.158905 ... 0.124529 0.200813 0.450330
ldl 0.158296 0.158905 1.000000 ... 0.330506 -0.033403 0.311799
adiposity 0.356500 0.286640 0.440432 ... 0.716556 0.100330 0.625954
typea -0.057454 -0.014608 0.044048 ... 0.074006 0.039498 -0.102606
obesity 0.238067 0.124529 0.330506 ... 1.000000 0.051620 0.291777
alcohol 0.140096 0.200813 -0.033403 ... 0.051620 1.000000 0.101125
age 0.388771 0.450330 0.311799 ... 0.291777 0.101125 1.000000
[8 rows x 8 columns]
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
square=True, ax=ax)
open_close_plot()
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
#print(os.getcwd())
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.head())
sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
** Convirtiendo una categoría en números**
print(pd.value_counts(datos["chd"]))
No 302
Si 160
Name: chd, dtype: int64
** Equivalente**
print(datos['chd'].value_counts())
No 302
Si 160
Name: chd, dtype: int64
** La siguiente función recodifica usando pandas una categoría con números**
** Nota:** Esto NO convierte la variable en numérica.
def recodificar(col, nuevo_codigo):
    """Return a copy of *col* with its values recoded via *nuevo_codigo*.

    Parameters
    ----------
    col : pd.Series or array-like
        Column whose values will be recoded; it is never modified.
    nuevo_codigo : dict
        Mapping ``{old_value: new_value}``. Values not present in the
        mapping are kept unchanged.

    Returns
    -------
    pd.Series
        A new series with the mapping applied.

    Note: this recodes the labels; it does NOT change the column's dtype
    to numeric by itself.
    """
    # Single dict-based replace instead of one in-place pass per key:
    # sequential per-key replacement can cascade (a value recoded into a
    # later key would be recoded again, e.g. {1: 2, 2: 3} turns 1 into 3),
    # while the dict form applies the whole mapping atomically.
    return pd.Series(col, copy=True).replace(nuevo_codigo)
datos["chd"] = recodificar(datos["chd"], {'No':0,'Si':1})
print(datos.head())
sbp tobacco ldl adiposity ... obesity alcohol age chd
0 160 12.00 5.73 23.11 ... 25.30 97.20 52 1
1 144 0.01 4.41 28.61 ... 28.87 2.06 63 1
2 118 0.08 3.48 32.28 ... 29.14 3.81 46 0
3 170 7.50 6.41 38.03 ... 31.99 24.26 58 1
4 134 13.60 3.50 27.78 ... 25.99 57.34 49 1
[5 rows x 10 columns]
** Luego de recodificar**
print(pd.value_counts(datos["chd"]))
0 302
1 160
Name: chd, dtype: int64
** Equivalente**
print(datos['chd'].value_counts())
0 302
1 160
Name: chd, dtype: int64
** A la inversa: Convirtiendo un número en una categoría**
datos["chd"] = recodificar(datos["chd"], {0:'No',1:'Si'})
print(datos.head())
sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits
digits = load_digits()
print(digits.data.shape)
(1797, 64)
pca = PCA(2) # Reduce las dimensiones a 2
componentes = pca.fit_transform(digits.data)
print(digits.data.shape)
(1797, 64)
print(componentes.shape)
(1797, 2)
plt.scatter(componentes[:, 0], componentes[:, 1],
c=digits.target, edgecolor='none', alpha=0.5,
cmap=plt.cm.get_cmap('viridis', 10))
plt.xlabel('componente 1')
plt.ylabel('componente 2')
plt.colorbar()
open_close_plot()
import os
import pandas as pd
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
#print(os.getcwd())
datos = pd.read_csv('EjemploEstudiantes.csv',delimiter=';',decimal=",",index_col=0)
print(datos)
Matematicas Ciencias Espanol Historia EdFisica
Lucia 7.0 6.5 9.2 8.6 8.0
Pedro 7.5 9.4 7.3 7.0 7.0
Ines 7.6 9.2 8.0 8.0 7.5
Luis 5.0 6.5 6.5 7.0 9.0
Andres 6.0 6.0 7.8 8.9 7.3
Ana 7.8 9.6 7.7 8.0 6.5
Carlos 6.3 6.4 8.2 9.0 7.2
Jose 7.9 9.7 7.5 8.0 6.0
Sonia 6.0 6.0 6.5 5.5 8.7
Maria 6.8 7.2 8.7 9.0 7.0
print(datos.head())
Matematicas Ciencias Espanol Historia EdFisica
Lucia 7.0 6.5 9.2 8.6 8.0
Pedro 7.5 9.4 7.3 7.0 7.0
Ines 7.6 9.2 8.0 8.0 7.5
Luis 5.0 6.5 6.5 7.0 9.0
Andres 6.0 6.0 7.8 8.9 7.3
print(datos.shape)
(10, 5)
pca = PCA(n_components=2)
componentes = pca.fit_transform(datos)
print(componentes)
[[-0.76471745 -1.5817637 ]
[ 1.66887794 1.39196556]
[ 1.57822841 0.29949595]
[-2.60701317 1.32020402]
[-1.43877557 -1.33566867]
[ 2.34790534 0.3880845 ]
[-0.89372557 -1.51890124]
[ 2.64984571 0.4254636 ]
[-2.62959083 2.18339513]
[ 0.08896518 -1.57227516]]
print(datos.shape)
(10, 5)
print(componentes.shape)
(10, 2)
plt.scatter(componentes[:, 0], componentes[:, 1])
plt.xlabel('componente 1')
plt.ylabel('componente 2')
open_close_plot()
** En Mac (Terminal):**
pip install git+https://github.com/MaxHalford/Prince
** En Windows (Anaconda Prompt): **
pip install git+https://github.com/MaxHalford/Prince
** Prince en githup:** + https://github.com/MaxHalford/prince
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
#print(os.getcwd())
datos = pd.read_csv('EjemploEstudiantes.csv',delimiter=';',decimal=",",index_col=0)
print(datos)
Matematicas Ciencias Espanol Historia EdFisica
Lucia 7.0 6.5 9.2 8.6 8.0
Pedro 7.5 9.4 7.3 7.0 7.0
Ines 7.6 9.2 8.0 8.0 7.5
Luis 5.0 6.5 6.5 7.0 9.0
Andres 6.0 6.0 7.8 8.9 7.3
Ana 7.8 9.6 7.7 8.0 6.5
Carlos 6.3 6.4 8.2 9.0 7.2
Jose 7.9 9.7 7.5 8.0 6.0
Sonia 6.0 6.0 6.5 5.5 8.7
Maria 6.8 7.2 8.7 9.0 7.0
print(datos.head())
Matematicas Ciencias Espanol Historia EdFisica
Lucia 7.0 6.5 9.2 8.6 8.0
Pedro 7.5 9.4 7.3 7.0 7.0
Ines 7.6 9.2 8.0 8.0 7.5
Luis 5.0 6.5 6.5 7.0 9.0
Andres 6.0 6.0 7.8 8.9 7.3
print(datos.shape)
(10, 5)
pca = prince.PCA(n_components=5)
pca = pca.fit(datos)
# Plotea el plano principal
pca.plot_row_coordinates(datos,labels=datos.index,ellipse_fill=True)
open_close_plot()
# Despliega las Componentes Principales
print(pca.row_coordinates(datos))
# Despliega los cosenos cuadrados
0 1 2 3 4
Lucia -0.323063 1.772525 1.198801 -0.055015 -0.003633
Pedro -0.665441 -1.638702 0.145476 -0.023065 0.123377
Ines -1.002547 -0.515692 0.628888 0.516444 -0.142876
Luis 3.172095 -0.262782 -0.381960 0.677777 0.062504
Andres 0.488868 1.365402 -0.835236 -0.155792 -0.123367
Ana -1.708633 -1.021700 -0.127077 0.066833 -0.025292
Carlos -0.067586 1.462336 -0.506240 -0.117928 -0.013124
Jose -2.011855 -1.275865 -0.542150 -0.197787 -0.017434
Sonia 3.042030 -1.254881 0.448829 -0.639999 -0.037885
Maria -0.923869 1.369359 -0.029330 -0.071467 0.177730
print(pca.row_cosine_similarities(datos))
# Despliega las correlaciones de las variables con respecto a las componentes
0 1 2 3 4
Lucia 0.022271 0.670421 0.306660 0.000646 0.000003
Pedro 0.139906 0.848431 0.006687 0.000168 0.004809
Ines 0.514469 0.136123 0.202440 0.136520 0.010449
Luis 0.936852 0.006429 0.013584 0.042771 0.000364
Andres 0.084140 0.656354 0.245604 0.008545 0.005358
Ana 0.732686 0.261980 0.004053 0.001121 0.000161
Carlos 0.001893 0.886081 0.106192 0.005763 0.000071
Jose 0.673612 0.270910 0.048917 0.006510 0.000051
Sonia 0.808830 0.137637 0.017607 0.035800 0.000125
Maria 0.308554 0.677869 0.000311 0.001846 0.011419
print(pca.column_correlations(datos))
# Valores Propios
0 1 2 3 4
Ciencias -0.722798 -0.648395 0.023840 0.235878 0.030682
EdFisica 0.913926 0.119637 0.340651 0.183154 -0.028929
Espanol -0.610893 0.717321 0.331025 -0.024542 0.045615
Historia -0.599923 0.748470 -0.232063 0.156397 -0.039644
Matematicas -0.895798 -0.345204 0.257979 -0.091468 -0.058828
print(pca.eigenvalues_)
[28.932496734179473, 16.28650424977316, 3.465960485145296, 1.2261245959725244, 0.08891393492955202]
def circulo(datos, eje1=0, eje2=1, pca_model=None):
    """Draw the correlation circle for two principal components.

    Each variable in ``datos`` is drawn as an arrow whose coordinates are its
    correlations with principal components ``eje1`` (x-axis) and ``eje2``
    (y-axis), inside the unit circle.

    Parameters
    ----------
    datos : pandas.DataFrame
        The numeric data the PCA model was fitted on.
    eje1, eje2 : int, optional
        Indices of the principal components to plot (defaults: 0 and 1).
    pca_model : fitted prince.PCA, optional
        PCA model to use. Defaults to the module-level ``pca`` variable,
        preserving the original behavior of existing calls.
    """
    modelo = pca if pca_model is None else pca_model
    corr = modelo.column_correlations(datos)
    (fig, ax) = plt.subplots(figsize=(12, 12))
    # Draw the axis cross (unit arrows along +y, +x, -y, -x).
    ax.arrow(0, 0, 0, 1, color="b")
    ax.arrow(0, 0, 1, 0, color="b")
    ax.arrow(0, 0, 0, -1, color="b")
    ax.arrow(0, 0, -1, 0, color="b")
    for i in range(0, len(corr.values)):
        # Scale by 0.95 so the arrow heads stay inside the unit circle.
        ax.arrow(0,
                 0,
                 corr.values[i, eje1]*0.95,  # correlation with component eje1
                 corr.values[i, eje2]*0.95,  # correlation with component eje2
                 head_width=0.05,
                 head_length=0.05)
        plt.text(corr.values[i, eje1] + 0.05,
                 corr.values[i, eje2] + 0.05,
                 corr.index.values[i])
    an = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(an), np.sin(an), color="b")  # add the unit circle
    plt.axis('equal')
    ax.set_title('Círculo de Correlaciones')
circulo(datos)
open_close_plot()
pca = prince.PCA(n_components=5)
pca = pca.fit(datos)
# Plotea el plano principal en "x_component=0, y_component=2"
pca.plot_row_coordinates(datos,x_component=0, y_component=2,labels=datos.index,ellipse_fill=True)
open_close_plot()
circulo(datos,eje1=0,eje2=2)
open_close_plot()
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
iris = pd.read_csv('iris.csv',delimiter=';',decimal=".")
print(iris)
s.largo s.ancho p.largo p.ancho tipo
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
5 5.4 3.9 1.7 0.4 setosa
6 4.6 3.4 1.4 0.3 setosa
7 5.0 3.4 1.5 0.2 setosa
8 4.4 2.9 1.4 0.2 setosa
9 4.9 3.1 1.5 0.1 setosa
10 5.4 3.7 1.5 0.2 setosa
11 4.8 3.4 1.6 0.2 setosa
12 4.8 3.0 1.4 0.1 setosa
13 4.3 3.0 1.1 0.1 setosa
14 5.8 4.0 1.2 0.2 setosa
15 5.7 4.4 1.5 0.4 setosa
16 5.4 3.9 1.3 0.4 setosa
17 5.1 3.5 1.4 0.3 setosa
18 5.7 3.8 1.7 0.3 setosa
19 5.1 3.8 1.5 0.3 setosa
20 5.4 3.4 1.7 0.2 setosa
21 5.1 3.7 1.5 0.4 setosa
22 4.6 3.6 1.0 0.2 setosa
23 5.1 3.3 1.7 0.5 setosa
24 4.8 3.4 1.9 0.2 setosa
25 5.0 3.0 1.6 0.2 setosa
26 5.0 3.4 1.6 0.4 setosa
27 5.2 3.5 1.5 0.2 setosa
28 5.2 3.4 1.4 0.2 setosa
29 4.7 3.2 1.6 0.2 setosa
.. ... ... ... ... ...
120 6.9 3.2 5.7 2.3 virginica
121 5.6 2.8 4.9 2.0 virginica
122 7.7 2.8 6.7 2.0 virginica
123 6.3 2.7 4.9 1.8 virginica
124 6.7 3.3 5.7 2.1 virginica
125 7.2 3.2 6.0 1.8 virginica
126 6.2 2.8 4.8 1.8 virginica
127 6.1 3.0 4.9 1.8 virginica
128 6.4 2.8 5.6 2.1 virginica
129 7.2 3.0 5.8 1.6 virginica
130 7.4 2.8 6.1 1.9 virginica
131 7.9 3.8 6.4 2.0 virginica
132 6.4 2.8 5.6 2.2 virginica
133 6.3 2.8 5.1 1.5 virginica
134 6.1 2.6 5.6 1.4 virginica
135 7.7 3.0 6.1 2.3 virginica
136 6.3 3.4 5.6 2.4 virginica
137 6.4 3.1 5.5 1.8 virginica
138 6.0 3.0 4.8 1.8 virginica
139 6.9 3.1 5.4 2.1 virginica
140 6.7 3.1 5.6 2.4 virginica
141 6.9 3.1 5.1 2.3 virginica
142 5.8 2.7 5.1 1.9 virginica
143 6.8 3.2 5.9 2.3 virginica
144 6.7 3.3 5.7 2.5 virginica
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica
[150 rows x 5 columns]
print(iris.head())
s.largo s.ancho p.largo p.ancho tipo
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
print(iris.shape)
(150, 5)
iris2 = pd.DataFrame(data=iris, columns=['s.largo', 's.ancho', 'p.largo', 'p.ancho'])
y = iris['tipo']
pca = prince.PCA(n_components=4)
pca = pca.fit(iris2)
# Plotea el plano principal
pca.plot_row_coordinates(iris2,labels=None,ellipse_fill=True,color_labels=y)
open_close_plot()
# Despliega las Componentes Principales
print(pca.row_coordinates(iris2))
# Despliega los cosenos cuadrados
0 1 2 3
0 -2.264542 0.505704 -0.121943 -0.023073
1 -2.086426 -0.655405 -0.227251 -0.103208
2 -2.367950 -0.318477 0.051480 -0.027825
3 -2.304197 -0.575368 0.098860 0.066311
4 -2.388777 0.674767 0.021428 0.037397
5 -2.070537 1.518549 0.030684 -0.004399
6 -2.445711 0.074563 0.342198 0.038097
7 -2.233842 0.247614 -0.082574 0.025505
8 -2.341958 -1.095146 0.153562 0.026794
9 -2.188676 -0.448629 -0.246560 0.039907
10 -2.163487 1.070596 -0.264009 -0.015301
11 -2.327378 0.158587 0.100166 0.134554
12 -2.224083 -0.709118 -0.223215 -0.002631
13 -2.639716 -0.938282 0.189570 0.019422
14 -2.192292 1.889979 -0.469480 -0.192782
15 -2.251465 2.722371 0.032604 -0.047126
16 -2.202750 1.513750 -0.001363 -0.186632
17 -2.190179 0.514304 -0.038616 -0.091906
18 -1.894074 1.431111 -0.370743 -0.059528
19 -2.339949 1.158033 0.137418 0.039828
20 -1.914556 0.430465 -0.416007 -0.010360
21 -2.204645 0.952457 0.164738 -0.057729
22 -2.774170 0.489517 0.338836 -0.017854
23 -1.820412 0.106751 0.040061 -0.150346
24 -2.228217 0.162186 0.124201 0.271229
25 -1.957024 -0.607893 -0.298591 -0.043837
26 -2.052063 0.266014 0.092093 -0.066601
27 -2.168194 0.552016 -0.201295 -0.009261
28 -2.140306 0.336640 -0.265315 -0.083544
29 -2.268790 -0.314879 0.075515 0.108849
.. ... ... ... ...
120 2.040496 0.907399 0.231878 -0.167140
121 0.973915 -0.571174 0.829504 -0.027319
122 2.898064 0.397791 -0.860927 0.126074
123 1.329194 -0.486761 -0.004707 -0.140598
124 1.704241 1.014148 0.295958 0.062740
125 1.957728 1.003335 -0.422817 0.218459
126 1.171905 -0.318897 0.130652 -0.125685
127 1.019781 0.065543 0.338042 0.009069
128 1.786009 -0.193273 0.270003 -0.031207
129 1.864778 0.555382 -0.717511 0.207557
130 2.435497 0.246654 -0.730234 0.016794
131 2.316082 2.626184 -0.499620 0.213160
132 1.860371 -0.184672 0.353330 -0.100039
133 1.111272 -0.295986 -0.182660 0.185740
134 1.197469 -0.817168 -0.163214 0.488404
135 2.800949 0.844748 -0.547001 -0.296321
136 1.580155 1.072474 0.943393 -0.033607
137 1.347044 0.422256 0.180029 0.215907
138 0.923433 0.019230 0.417394 -0.004744
139 1.853552 0.672423 -0.014820 -0.194875
140 2.016157 0.610397 0.425915 -0.246765
141 1.903117 0.686025 0.127799 -0.469214
142 1.153190 -0.701326 0.531465 0.040414
143 2.043308 0.864685 0.335266 -0.044278
144 2.001691 1.048550 0.629269 -0.212588
145 1.870522 0.382822 0.254532 -0.388890
146 1.558492 -0.905314 -0.025382 -0.221322
147 1.520845 0.266795 0.179277 -0.118903
148 1.376391 1.016362 0.931405 -0.024146
149 0.959299 -0.022284 0.528794 0.163676
[150 rows x 4 columns]
print(pca.row_cosine_similarities(iris2))
# Despliega las correlaciones de las variables con respecto a las componentes
0 1 2 3
0 0.949782 0.047365 2.754100e-03 0.000099
1 0.898483 0.088659 1.065897e-02 0.002199
2 0.981644 0.017757 4.639588e-04 0.000136
3 0.938948 0.058545 1.728413e-03 0.000778
4 0.925826 0.073873 7.449634e-05 0.000227
5 0.650148 0.349706 1.427833e-04 0.000003
6 0.979673 0.000911 1.917894e-02 0.000238
7 0.986404 0.012120 1.347849e-03 0.000129
8 0.817595 0.178782 3.515201e-03 0.000107
9 0.947832 0.039824 1.202852e-02 0.000315
10 0.793767 0.194373 1.182016e-02 0.000040
11 0.990258 0.004598 1.834222e-03 0.003310
12 0.899499 0.091440 9.060318e-03 0.000001
13 0.883740 0.111655 4.557737e-03 0.000048
14 0.556541 0.413632 2.552319e-02 0.004304
15 0.406058 0.593679 8.515175e-05 0.000178
16 0.675934 0.319214 2.589872e-07 0.004852
17 0.945883 0.052158 2.940380e-04 0.001666
18 0.621044 0.354548 2.379436e-02 0.000613
19 0.800858 0.196148 2.762027e-03 0.000232
20 0.910917 0.046049 4.300743e-02 0.000027
21 0.838284 0.156461 4.680617e-03 0.000575
22 0.955935 0.029764 1.426075e-02 0.000040
23 0.989370 0.003402 4.791522e-04 0.006748
24 0.977305 0.005178 3.036464e-03 0.014481
25 0.892645 0.086127 2.077974e-02 0.000448
26 0.980515 0.016477 1.974812e-03 0.001033
27 0.931569 0.060384 8.029474e-03 0.000017
28 0.960035 0.023750 1.475222e-02 0.001463
29 0.977831 0.018835 1.083295e-03 0.002251
.. ... ... ... ...
120 0.821438 0.162443 1.060774e-02 0.005511
121 0.483053 0.166146 3.504205e-01 0.000380
122 0.901727 0.016989 7.957770e-02 0.001707
123 0.873127 0.117093 1.095100e-05 0.009769
124 0.721696 0.255561 2.176467e-02 0.000978
125 0.756571 0.198718 3.528990e-02 0.009421
126 0.910763 0.067441 1.132016e-02 0.010476
127 0.897592 0.003708 9.862948e-02 0.000071
128 0.966305 0.011316 2.208425e-02 0.000295
129 0.800552 0.071010 1.185202e-01 0.009918
130 0.908924 0.009322 8.171022e-02 0.000043
131 0.427220 0.549281 1.988029e-02 0.003619
132 0.953455 0.009395 3.439242e-02 0.002757
133 0.888182 0.063009 2.399641e-02 0.024813
134 0.605834 0.282129 1.125483e-02 0.100782
135 0.876970 0.079768 3.344649e-02 0.009815
136 0.550193 0.253448 1.961102e-01 0.000249
137 0.875799 0.086058 1.564315e-02 0.022499
138 0.830036 0.000360 1.695818e-01 0.000022
139 0.875102 0.115169 5.594548e-05 0.009673
140 0.868609 0.079616 3.876334e-02 0.013012
141 0.836653 0.108716 3.772871e-03 0.050858
142 0.631518 0.233574 1.341322e-01 0.000776
143 0.828863 0.148433 2.231485e-02 0.000389
144 0.722279 0.198193 7.138111e-02 0.008147
145 0.906103 0.037953 1.677791e-02 0.039166
146 0.736450 0.248503 1.953361e-04 0.014852
147 0.951672 0.029287 1.322414e-02 0.005817
148 0.499126 0.272159 2.285616e-01 0.000154
149 0.749903 0.000405 2.278615e-01 0.021831
[150 rows x 4 columns]
print(pca.column_correlations(iris2))
# Valores Propios
0 1 2 3
p.ancho 0.964996 0.062786 0.243295 -0.075157
p.largo 0.991684 0.020247 0.054084 0.115009
s.ancho -0.449313 0.888351 0.092908 0.017820
s.largo 0.891224 0.357352 -0.276774 -0.037610
print(pca.eigenvalues_)
# Plotea el Círculo de Correlaciones
[436.6227125628082, 138.183139606084, 22.102991745764324, 3.091156085343784]
circulo(iris2)
open_close_plot()
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
datos = pd.read_csv('EjemploEstudiantes_Categoricas.csv',delimiter=';',decimal=",",index_col=0)
print(datos)
Matematicas Ciencias Espanol ... EdFisica Sexo Conducta
Lucia 7.0 6.5 9.2 ... 8.0 F 3
Pedro 7.5 9.4 7.3 ... 7.0 M 2
Ines 7.6 9.2 8.0 ... 7.5 F 2
Luis 5.0 6.5 6.5 ... 9.0 M 1
Andres 6.0 6.0 7.8 ... 7.3 M 2
Ana 7.8 9.6 7.7 ... 6.5 F 3
Carlos 6.3 6.4 8.2 ... 7.2 M 1
Jose 7.9 9.7 7.5 ... 6.0 M 1
Sonia 6.0 6.0 6.5 ... 8.7 F 2
Maria 6.8 7.2 8.7 ... 7.0 F 3
[10 rows x 7 columns]
print(datos.head())
Matematicas Ciencias Espanol ... EdFisica Sexo Conducta
Lucia 7.0 6.5 9.2 ... 8.0 F 3
Pedro 7.5 9.4 7.3 ... 7.0 M 2
Ines 7.6 9.2 8.0 ... 7.5 F 2
Luis 5.0 6.5 6.5 ... 9.0 M 1
Andres 6.0 6.0 7.8 ... 7.3 M 2
[5 rows x 7 columns]
print(datos.shape)
(10, 7)
print(datos.dtypes)
Matematicas float64
Ciencias float64
Espanol float64
Historia float64
EdFisica float64
Sexo object
Conducta int64
dtype: object
datos["Conducta"] = recodificar(datos["Conducta"], {1:'Mala',2:'Regular',3:'Buena'})
print(datos.head())
Matematicas Ciencias Espanol ... EdFisica Sexo Conducta
Lucia 7.0 6.5 9.2 ... 8.0 F Buena
Pedro 7.5 9.4 7.3 ... 7.0 M Regular
Ines 7.6 9.2 8.0 ... 7.5 F Regular
Luis 5.0 6.5 6.5 ... 9.0 M Mala
Andres 6.0 6.0 7.8 ... 7.3 M Regular
[5 rows x 7 columns]
print(datos.dtypes)
# Convirtiendo las variables en Dummy
Matematicas float64
Ciencias float64
Espanol float64
Historia float64
EdFisica float64
Sexo object
Conducta object
dtype: object
datos_dummy = pd.get_dummies(datos)
print(datos_dummy.head())
Matematicas Ciencias ... Conducta_Mala Conducta_Regular
Lucia 7.0 6.5 ... 0 0
Pedro 7.5 9.4 ... 0 1
Ines 7.6 9.2 ... 0 1
Luis 5.0 6.5 ... 1 0
Andres 6.0 6.0 ... 0 1
[5 rows x 10 columns]
print(datos_dummy.dtypes)
Matematicas float64
Ciencias float64
Espanol float64
Historia float64
EdFisica float64
Sexo_F uint8
Sexo_M uint8
Conducta_Buena uint8
Conducta_Mala uint8
Conducta_Regular uint8
dtype: object
pca = prince.PCA(n_components=5)
pca = pca.fit(datos_dummy)
# Plotea el plano principal
pca.plot_row_coordinates(datos_dummy,labels=datos.index,ellipse_fill=True)
open_close_plot()
# Despliega las Componentes Principales
print(pca.row_coordinates(datos_dummy))
# Despliega los cosenos cuadrados
0 1 2 3 4
Lucia -2.345143 0.903291 -1.778812 0.148430 0.124922
Pedro 0.808131 -0.653515 2.292669 0.620970 -0.635678
Ines -1.132539 0.728146 1.621333 0.542058 1.143184
Luis 3.447112 0.467560 -1.409943 -1.306979 -0.206997
Andres 1.227141 -0.122009 -0.390823 2.264646 -0.499018
Ana -2.600893 -0.265107 0.736970 -1.203789 -0.661441
Carlos 1.190401 -1.750887 -1.683095 0.334195 0.579424
Jose 0.388347 -3.023783 1.018847 -0.942744 0.296697
Sonia 1.498038 3.467720 0.921524 -0.530161 0.138735
Maria -2.480596 0.248585 -1.328671 0.073375 -0.279829
print(pca.row_cosine_similarities(datos_dummy))
# Despliega las correlaciones de las variables con respecto a las componentes
0 1 2 3 4
Lucia 0.577855 0.085730 0.332460 0.002315 0.001640
Pedro 0.091645 0.059931 0.737609 0.054111 0.056704
Ines 0.212279 0.087748 0.435056 0.048629 0.216288
Luis 0.750155 0.013801 0.125500 0.107839 0.002705
Andres 0.213565 0.002111 0.021662 0.727346 0.035316
Ana 0.730155 0.007586 0.058623 0.156412 0.047223
Carlos 0.182542 0.394905 0.364917 0.014387 0.043248
Jose 0.013336 0.808500 0.091790 0.078590 0.007784
Sonia 0.145545 0.779901 0.055076 0.018229 0.001248
Maria 0.763045 0.007663 0.218914 0.000668 0.009710
print(pca.column_correlations(datos_dummy))
# Valores Propios
0 1 2 3 4
Ciencias -0.363301 -0.473786 0.715784 -0.279929 0.008210
Conducta_Buena -0.836286 0.119158 -0.363797 -0.211407 -0.326481
Conducta_Mala 0.565944 -0.578760 -0.318321 -0.412385 0.267603
Conducta_Regular 0.252882 0.429919 0.638062 0.583504 0.055076
EdFisica 0.471553 0.759836 -0.324426 -0.090167 0.107915
Espanol -0.755726 -0.220401 -0.424365 0.362187 0.172232
Historia -0.465278 -0.552918 -0.503507 0.421552 0.072565
Matematicas -0.683863 -0.404389 0.573295 -0.059553 0.107721
Sexo_F -0.728747 0.625952 0.024241 -0.191410 0.170651
Sexo_M 0.728747 -0.625952 -0.024241 0.191410 -0.170651
print(pca.eigenvalues_)
[37.55385737995785, 26.372739931848024, 20.218462903361623, 10.274293157561727, 2.9772458409115674]
circulo(datos_dummy)
open_close_plot()